#https://blogs.rstudio.com/tensorflow/posts/2017-12-22-word-embeddings-with-keras/
# Remember and display the directory the script was started from.
current_wd <- getwd()
current_wd
# Placeholder for a project directory: put a path between the quotes to run the
# script from another location. An empty string means "stay where we are";
# setwd("") is an error, so the directory is only changed for a non-empty path.
current_wd <- ""
if (nzchar(current_wd)) {
  setwd(current_wd)
}
############################################################################
########
############################################################################
########
############################################################################
########
############################################################################
########
#download.file("https://snap.stanford.edu/data/finefoods.txt.gz","finefoods.txt.gz")
library(dplyr)
library(readr)
library(stringr)
library(quanteda)
# Read the raw dump line by line and keep only the review bodies.
d_data <- read_lines("lab_pre.csv")
# Review text lines start with the 12-character prefix "review/text:".
d_data <- d_data[startsWith(d_data, "review/text:")]
# Drop the prefix and the character after it: the text starts at position 14.
d_data <- str_sub(d_data, start = 14)
d_data <- iconv(d_data, to = "UTF-8")
# Rough vocabulary size: distinct space-separated tokens in the first 200
# reviews. head() is used instead of d_data[1:200] because the latter pads the
# selection with NA when fewer than 200 reviews exist, and that NA would be
# counted as an extra "word".
unique_words <- length(unique(unlist(str_split(head(d_data, 200), " "))))
unique_words
############################################################################
########
############################################################################
########
############################################################################
########
############################################################################
######## We’ll begin with some text pre-processing using a keras text_tokenizer().
#The tokenizer will be responsible for transforming each review into a sequence of integer 
#tokens (which will subsequently be used as input into the skip-gram model).

library(tensorflow)
#install_tensorflow(version = "gpu")
library(keras)


#install_keras(tensorflow = "gpu")
# Create the keras tokenizer with a vocabulary cap of num_words = 2000.
# The constructor is text_tokenizer().
tokenizer <- text_tokenizer(num_words = 2000)
# Fit the tokenizer on the review texts; the result is not re-assigned because
# fit_text_tokenizer() updates `tokenizer` in place.
tokenizer %>% fit_text_tokenizer(d_data)
############################################################################
######## Note that the tokenizer object is modified in place by the call to fit_text_tokenizer(). 
#An integer token will be assigned for each of the 2,000 most common words (num_words = 2000)
#(the other words will be assigned to token 0).
############################################################################
########
############################################################################
########In the skip-gram model we will use each word as input to a log-linear classifier with a 
#projection layer, then predict words within a certain range before and after this word. 
#It would be very computationally expensive to output a probability distribution over all the
#vocabulary for each target word we input into the model. Instead, we are going to use negative 
#sampling, meaning we will sample some words that don’t appear in the context and train a binary 
#classifier to predict if the context word we passed is truly from the context or not.
############################################################################
######## In more practical terms, for the skip-gram model we will input a 1d integer vector of the target word tokens and a 1d integer vector of sampled context word tokens. We will generate a prediction of 1 if the sampled word really appeared in the context and 0 if it didn’t.
#We will now define a generator function to yield batches for model training.
library(reticulate)
library(purrr)
#https://becominghuman.ai/how-does-word2vecs-skip-gram-work-f92e0525def4
# Build a generator function that yields skip-gram training batches.
#
# Arguments:
#   text             Character vector of texts. It is shuffled with sample()
#                    at the start of every pass over the data.
#   tokenizer        Fitted keras tokenizer; tokenizer$num_words is used as the
#                    vocabulary size for skipgrams().
#   window_size      Forwarded to skipgrams() as `window_size`.
#   negative_samples Forwarded to skipgrams() as `negative_samples`.
#
# Returns a zero-argument function. Each call takes the next tokenized text
# and returns list(x, y): `x` is a list of matrices built from the two
# positions of every couple, `y` is a matrix of the labels.
skipgrams_generator <-
  function(text,
           tokenizer,
           window_size,
           negative_samples) {
    # One pass = one shuffled sweep over all texts.
    new_pass <- function() {
      texts_to_sequences_generator(tokenizer, sample(text))
    }
    gen <- new_pass()
    function() {
      sequence <- generator_next(gen)
      if (is.null(sequence)) {
        # generator_next() returns NULL once every text has been consumed.
        # Start a freshly shuffled pass so that training runs longer than
        # length(text) steps do not fail part-way through.
        gen <<- new_pass()
        sequence <- generator_next(gen)
      }
      skip <- skipgrams(
        sequence,
        vocabulary_size = tokenizer$num_words,
        window_size = window_size,
        # Honour the caller's setting instead of a hard-coded 1.
        negative_samples = negative_samples
      )
      # Split the couples into one column per model input.
      x <-
        transpose(skip$couples) %>% map(. %>% unlist %>% as.matrix(ncol = 1))
      y <- skip$labels %>% as.matrix(ncol = 1)
      list(x, y)
    }
  }

############################################################################
######## A generator function is a function that returns a different value each time it is 
#called (generator functions are often used to provide streaming or dynamic data for training models). 
#Our generator function will receive a vector of texts, a tokenizer and the arguments for the skip-gram (the size of the window around each target word we examine and how many negative samples we want to sample for each target word).
############################################################################
########
############################################################################
########Now let’s start defining the keras model. We will use the Keras functional API.
############################################################################
########
embedding_size <- 128 # Dimension of the embedding vector.
skip_window <- 5 # How many words to consider left and right.
num_sampled <- 1 # Number of negative examples to sample for each word.

# We first write placeholders for the inputs using the layer_input function:
# one for the target word token and one for the sampled context word token.
input_target <- layer_input(shape = 1)
input_context <- layer_input(shape = 1)

# Now let's define the embedding matrix. The embedding is a matrix with
# dimensions (vocabulary, embedding_size) that acts as a lookup table for the
# word vectors.
embedding <- layer_embedding(
  input_dim = tokenizer$num_words + 1,
  output_dim = embedding_size,
  input_length = 1,
  name = "embedding"
)

# Both inputs go through the same embedding layer, so target and context words
# share one set of word vectors.
target_vector <- input_target %>%
  embedding() %>%
  layer_flatten()
context_vector <- input_context %>%
  embedding() %>%
  layer_flatten()

# The next step is to define how the target_vector will be related to the
# context_vector in order to make our network output 1 when the context word
# really appeared in the context and 0 otherwise. We want target_vector to be
# similar to the context_vector if they appeared in the same context. A typical
# measure of similarity is the cosine similarity. Given two vectors A and B,
# the cosine similarity is defined as the Euclidean dot product of A and B
# normalized by their magnitudes. As we don't need the similarity to be
# normalized inside the network, we will only calculate the dot product and
# then feed it to a dense layer with sigmoid activation.
# https://betterexplained.com/articles/vector-calculus-understanding-the-dot-product/
dot_product <-
  layer_dot(list(target_vector, context_vector), axes = 1)
output <-
  layer_dense(dot_product, units = 1, activation = "sigmoid")

# Now we will create the model and compile it. `model` is required by the
# training and weight-extraction steps further down.
model <- keras_model(list(input_target, input_context), output)
model %>% compile(
  loss = "binary_crossentropy",
  optimizer = "adam",
  metrics = "accuracy"
)

summary(model)

############################################################################
########
############################################################################
########
############################################################################
########
############################################################################
########
# Train the skip-gram model on batches produced by skipgrams_generator().
# The generator is fed the review texts (`d_data`) and the negative-sampling
# setting (`num_sampled`) defined earlier in this script.
history <- model %>%
  fit_generator(
    skipgrams_generator(d_data, tokenizer, skip_window, num_sampled),
    steps_per_epoch = 100000,
    epochs = 10
  )
# Plot the training history with base graphics.
plot(history, method = "base")

#We can now extract the embeddings matrix from the model by using the get_weights() function. We also added row.names to our embedding matrix so we can easily find where each word is.
# The first weight array of the model is taken as the embedding matrix.
embedding_matrix <- get_weights(model)[[1]]

# Word <-> integer id lookup table built from the tokenizer's word index.
# tibble() replaces the deprecated data_frame().
words <- tibble(
  word = names(tokenizer$word_index),
  id = as.integer(unlist(tokenizer$word_index))
)

# Keep only ids within the tokenizer's num_words limit, ordered by id so that
# the words line up with the rows of the embedding matrix.
words <- words %>%
  filter(id <= tokenizer$num_words) %>%
  arrange(id)
# The first row gets the placeholder name "UNK"; the words follow in id order.
row.names(embedding_matrix) <- c("UNK", words$word)

############################################################################
########
############################################################################
########
############################################################################
########

############################################################################
######## We can now find words that are close to each other in the embedding. We will use the cosine similarity, which is the normalized form of the dot product that the model was trained on.

library(text2vec)
# Return the `n` rows of `embedding_matrix` most similar to `word`.
#
# Arguments:
#   word             Row name of the query word in `embedding_matrix`.
#   embedding_matrix Numeric matrix whose row names are the words.
#   n                Number of entries to return (default 5).
#
# Returns a named numeric vector of cosine similarities, sorted from most to
# least similar and truncated to `n` entries.
find_similar_words <- function(word, embedding_matrix, n = 5) {
  # drop = FALSE keeps the query as a one-row matrix, as sim2() expects.
  query <- embedding_matrix[word, , drop = FALSE]
  scores <- sim2(embedding_matrix, y = query, method = "cosine")
  # `scores` has one column (one query word); take it as a named vector.
  ranked <- sort(scores[, 1], decreasing = TRUE)
  head(ranked, n)
}
find_similar_words("them", embedding_matrix, 10)
############################################################################
########
############################################################################
########
############################################################################
########
############################################################################
######## The t-SNE algorithm can be used to visualize the embeddings. Because of time constraints we will only use it with the 699 words in rows 2 to 700 of the embedding matrix (row 1 is the UNK entry). To understand more about the t-SNE method see the article How to Use t-SNE Effectively.
#This plot may look like a mess, but if you zoom into the small groups you end up seeing some nice patterns. Try, for example, to find a group of web related words like http, href, etc. Another group that may be easy to pick out is the pronouns group: she, he, her, etc.

library(Rtsne)
library(ggplot2)
library(plotly)
# Rows of the embedding matrix to visualize; row 1 is skipped.
plot_rows <- 2:700

# Project the selected word vectors to two dimensions with t-SNE.
tsne <- Rtsne(embedding_matrix[plot_rows, ], perplexity = 50, pca = TRUE)

# as.data.frame() names the two t-SNE coordinates V1 and V2; attach the
# matching words so they can be used as text labels.
tsne_coords <- as.data.frame(tsne$Y)
tsne_coords <- mutate(tsne_coords, word = row.names(embedding_matrix)[plot_rows])

tsne_plot <- ggplot(tsne_coords, aes(x = V1, y = V2, label = word)) +
  geom_text(size = 1)
tsne_plot
